Introduction

Libraries

We are going to use:

library(MASS)       # lda(); loaded explicitly (and before tidyverse, so dplyr::select wins) — NOTE(review): previously only available via dr's Depends
library(tidyverse)  # data manipulation and visualization
library(plotly)     # plots in 3D
library(ggplot2)    # plots in 2D
library(ggpubr)     # to combine multiple ggplot objects (ggarrange)
library(mvtnorm)    # to generate multivariate normal distribution
library(dr)         # SIR
library(factoextra) # PCA-related functions

Data

Let’s first define a function to generate Gaussian data. This function takes four arguments:

  • n: the number of observations;
  • center: the mean vector;
  • sigma: the covariance matrix;
  • label: the cluster label.
# Generate a labelled sample from a 3-D multivariate normal distribution.
#
# Args:
#   n:      number of observations to draw
#   center: mean vector (length 3, matching the "x", "y", "z" columns)
#   sigma:  3 x 3 covariance matrix
#   label:  cluster label attached to every row (stored as a factor)
#
# Returns: a data.frame with numeric columns x, y, z and a factor column
#   `class` holding the cluster label.
generateGaussianData <- function(n, center, sigma, label) {
  # Fail fast on inputs that do not match the hard-coded 3-D column names.
  stopifnot(length(center) == 3, all(dim(sigma) == c(3, 3)))
  data <- rmvnorm(n, center, sigma)        # mvtnorm draw: n x 3 matrix
  data <- data.frame(data)
  names(data) <- c("x", "y", "z")
  data <- data %>% mutate(class = factor(label))
  data
}

Now let’s simulate a dataset.

# Common covariance matrix: unit variances, strong (0.88) pairwise
# correlation. Built from a constant matrix plus diag() instead of the
# error-prone 9-element literal (which also relied on `byrow = T`;
# `T` is reassignable and should never be used for TRUE).
covmat <- matrix(0.88, nrow = 3, ncol = 3)
diag(covmat) <- 1

# Three clusters of 200 observations each, differing only in their
# x-coordinate mean, so the group structure lies (almost) entirely along x.
n_per_cluster <- 200
centers <- list(c(2, 8, 6),  # cluster 1
                c(4, 8, 6),  # cluster 2
                c(6, 8, 6))  # cluster 3

# One data.frame per cluster, labelled by its index k.
groups <- lapply(seq_along(centers), function(k) {
  generateGaussianData(n_per_cluster, centers[[k]], covmat, k)
})

# all data
df <- bind_rows(groups)

# Sanity check: first rows of the simulated data.
head(df)
##           x        y        z class
## 1 1.4857568 8.016398 6.031452     1
## 2 0.3278638 6.522821 4.580953     1
## 3 1.3782497 7.458459 6.226041     1
## 4 2.6557762 8.763941 6.405316     1
## 5 2.2629447 8.175069 6.990975     1
## 6 2.8397608 8.713051 6.445625     1
# Per-column summaries; the class counts confirm 200 observations per cluster.
summary(df)
##        x                 y                z         class  
##  Min.   :-0.1998   Min.   : 5.069   Min.   :2.125   1:200  
##  1st Qu.: 2.5501   1st Qu.: 7.375   1st Qu.:5.346   2:200  
##  Median : 3.9791   Median : 8.008   Median :5.994   3:200  
##  Mean   : 4.0333   Mean   : 8.028   Mean   :6.033          
##  3rd Qu.: 5.5042   3rd Qu.: 8.720   3rd Qu.:6.694          
##  Max.   : 8.3713   Max.   :10.978   Max.   :9.140

And plot our simulated data.

# Interactive 3-D scatter plot of the simulated clusters, one colour per
# class, built as a single pipe instead of repeated reassignments.
fig <- plot_ly(
  df,
  x = ~x, y = ~y, z = ~z,
  color = ~class,
  colors = c('#b3e378', '#81e5f0', '#ed5391')
) %>%
  add_markers() %>%
  layout(scene = list(xaxis = list(title = 'x'),
                      yaxis = list(title = 'y'),
                      zaxis = list(title = 'z')))
fig

LDA vs PCA

LDA

Let’s perform LDA:

# Fit linear discriminant analysis of class on x, y, z.
# NOTE(review): lda() comes from MASS, which is not loaded explicitly above;
# it presumably gets attached as a Depends of the dr package — confirm.
# The proportion-of-trace output below shows LD1 carries essentially all
# (99.97%) of the between-group separation, consistent with the clusters
# differing only in their x means.
lda.df <- lda(factor(class) ~ x + y + z, data = df)
lda.df
## Call:
## lda(factor(class) ~ x + y + z, data = df)
## 
## Prior probabilities of groups:
##         1         2         3 
## 0.3333333 0.3333333 0.3333333 
## 
## Group means:
##          x        y        z
## 1 2.093123 8.078746 6.081409
## 2 3.936263 7.937088 5.959751
## 3 6.070590 8.067554 6.056851
## 
## Coefficients of linear discriminants:
##         LD1           LD2
## x  2.362772  0.0008242327
## y -1.133210  1.3646797330
## z -1.114203 -0.4343873891
## 
## Proportion of trace:
##    LD1    LD2 
## 0.9997 0.0003

Let us plot the projections on LD1 and LD2

# Project the observations onto the discriminant directions.
# BUG FIX: predict.lda()'s argument is `newdata`, not `data`. The original
# `data = df` was silently swallowed by `...`, so predict() fell back to the
# training data — same result here (df IS the training data), but the call
# was misleading and would break for genuinely new data.
predmodel.lda <- predict(lda.df, newdata = df)

# projections with LDA classes
# predict.lda already returns the maximum-posterior class, so there is no
# need to recompute it with apply(posterior, 1, which.max).
estclass <- predmodel.lda$class
newdata2 <- data.frame(type = estclass, lda = predmodel.lda$x)
p1 <- ggplot(newdata2) + 
        geom_point(aes(lda.LD1, lda.LD2, colour = type), size = 2.5) +
        ggtitle("projections with LDA classes")

# projections with true classes
newdata <- data.frame(type = df$class, lda = predmodel.lda$x)
p2 <- ggplot(newdata) +
        geom_point(aes(lda.LD1, lda.LD2, colour = type), size = 2.5) +
        ggtitle("projections with true classes")

# Stack the two panels so predicted vs true colourings can be compared.
ggarrange(p1, p2,
          nrow = 2)

PCA

Now let us perform PCA.

# PCA on the three numeric columns, selected by name rather than position.
pc <- prcomp(df[, c("x", "y", "z")])
# Eigenvalues / explained variance of each principal component.
get_eig(pc)
##       eigenvalue variance.percent cumulative.variance.percent
## Dim.1  4.3887333        76.074162                    76.07416
## Dim.2  1.2629284        21.891560                    97.96572
## Dim.3  0.1173579         2.034277                   100.00000

This is the corresponding biplot.

# Biplot: individuals coloured by true class, variable arrows in blue,
# with labels drawn for variables only (label = "var").
fviz_pca_biplot(pc,
                col.var = "#2E9FDF",
                col.ind = df$class,
                label = "var")

Note that, considering only the first principal component, it is impossible to distinguish the three groups (all groups overlap).

SIR

Now we use SIR (Sliced Inverse Regression) as implemented in the dr package.

# "sir" is dr()'s default fitting method; it is spelled out here for clarity.
help(dr)

dr_res <- dr(class ~ x + y + z, data = df, method = "sir")

# Print the fitted SIR object: estimated basis vectors of the central
# subspace and their eigenvalues. The first eigenvalue dominates, and Dir1
# loads mostly on x — consistent with the LDA result above.
dr_res
## 
##  dr(formula = class ~ x + y + z, data = df, method = "sir")
## Estimated Basis Vectors for Central Subspace:
##         Dir1          Dir2         Dir3
## x  0.8297682 -0.0005755226 -0.003033721
## y -0.3979656 -0.9528910951  0.628512122
## z -0.3912904  0.3033120994 -0.777793873
## Eigenvalues:
## [1] 9.371909e-01 4.253111e-03 6.278830e-18
# Scatterplot matrix of the estimated directions, coloured by true class.
plot(dr_res, col=df$class)

# Components available on the fitted dr object.
names(dr_res)
##  [1] "x"            "y"            "weights"      "method"       "cases"       
##  [6] "qr"           "group"        "chi2approx"   "evectors"     "evalues"     
## [11] "numdir"       "raw.evectors" "M"            "slice.info"   "call"        
## [16] "y.name"       "terms"